/*
 * Copyright (c) 2006-2020, RT-Thread Development Team
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Date           Author       Notes
 * 2020-01-15     bigmagic     the first version
 * 2020-08-10     SummerGift   support clang compiler
 * 2023-04-29     GuEe-GUI     support kernel's ARM64 boot header
 */

#include "rtconfig.h"

    .section ".text.entrypoint","ax"

#ifdef RT_USING_OFW
/*
 * Boot image header, assembled only when RT_USING_OFW is defined.
 *
 * The aim is to boot RT-Thread without modifying the bootloader's
 * configuration wherever possible, so the image begins with the kernel's
 * boot header for ARM64:
 *   https://www.kernel.org/doc/html/latest/arm64/booting.html#call-the-kernel-image
 *
 * The fields below add up to 64 bytes. _text_offset is not defined in this
 * file; _end comes from link.lds.
 */
_head:
    b       _start          /* Executable code: branch past the header to _start */
    .long   0               /* Executable code (second word, left as zero) */
    .quad   _text_offset    /* Image load offset from start of RAM, little endian */
    .quad   _end - _head    /* Effective Image size, little endian (_end defined in link.lds) */
    .quad   0xa             /* Kernel flags, little endian: bits 1 and 3 set, i.e. 4K pages and placement anywhere in RAM per the linked spec */
    .quad   0               /* Reserved */
    .quad   0               /* Reserved */
    .quad   0               /* Reserved */
    .ascii  "ARM\x64"       /* Magic number */
    .long   0               /* Reserved (used for PE COFF offset) */
#endif
/*
 * Variable registers: x21~x28
 *
 * Aliases for the values captured at the top of _start. x21-x25 lie in the
 * AAPCS64 callee-saved range (x19-x28), so a conforming callee leaves them
 * intact across the `bl` calls made during boot.
 *
 * dtb_paddr/boot_arg0..2 are written in _start and not read again in this
 * file as shown; stack_top is declared but not referenced here.
 */
dtb_paddr .req x21
boot_arg0 .req x22
boot_arg1 .req x23
boot_arg2 .req x24
stack_top .req x25

    .global _start
_start:
/*
 * Primary entry point. Saves the boot arguments, parks every core whose id
 * is non-zero (unless RT_AMP_SLAVE is defined) and drops the remaining core
 * from EL3/EL2 down to EL1, continuing at .L__in_el1.
 *
 * Boot CPU general-purpose register settings:
 *   x0 = physical address of device tree blob (dtb) in system RAM.
 *   x1 = 0 (reserved for future use)
 *   x2 = 0 (reserved for future use)
 *   x3 = 0 (reserved for future use)
 */
    mov     dtb_paddr, x0
    mov     boot_arg0, x1
    mov     boot_arg1, x2
    mov     boot_arg2, x3
    /*
     * NOTE(review): this file has not loaded sp yet at the calls below;
     * presumably the callees do not touch the stack - confirm.
     */
#ifdef ARCH_ARM_BOOTWITH_FLUSH_CACHE
    bl      __asm_flush_dcache_all
#endif
    bl      rt_hw_cpu_id_set        /* presumably leaves this core's id in TPIDR_EL1, read next - confirm */
    /* read cpu id, stop slave cores */
    mrs     x0, tpidr_el1
    cbz     x0, .L__cpu_0           /* .L prefix is the local label in ELF */

#ifndef RT_AMP_SLAVE
    /* cpu id > 0, stop */
    /*
     * Idle loop for non-zero cpu ids. No other branch in this file targets
     * this label. When RT_AMP_SLAVE is defined the loop is not assembled and
     * a non-zero id falls through into .L__cpu_0.
     */
.L__current_cpu_idle:
    wfe
    b       .L__current_cpu_idle
#endif

.L__cpu_0:
    /* x1 = top of the boot stack; it is installed as SP_EL1 / sp further down */
    adr     x1, .el_stack_top

    /* set up EL1 */
    mrs     x0, CurrentEL           /* CurrentEL Register. bit 2, 3. Others reserved */
    and     x0, x0, #12             /* clear reserved bits */

    /* running at EL3? */
    cmp     x0, #12                 /* 1100b. So, EL3 */
    bne     .L__not_in_el3          /* not EL3 -> continue with the EL2/EL1 checks */

    /* should never be executed, just for completeness. (EL3) */
    mov     x2, #0x5b1              /* bits 0, 4, 5, 7, 8, 10 (NS, RES1, RES1, SMD, HCE, RW per the Arm ARM) */
    msr     scr_el3, x2             /* SCR_ELn  Secure Configuration Register */
    mov     x2, #0x3c9              /* DAIF masked, M[3:0] = 1001b (EL2h) */
    msr     spsr_el3, x2            /* SPSR_ELn. Saved Program Status Register. 1111001001 */
    adr     x2, .L__not_in_el3
    msr     elr_el3, x2
    eret                            /* Exception Return: from EL3, continue from .L__not_in_el3 */

.L__not_in_el3:                     /* running at EL2 or EL1 */
    /*
     * x0 still holds the CurrentEL value sampled above, so after the eret
     * from EL3 it reads 12 and the EL2 path below is taken.
     */
    cmp     x0, #4                  /* 0x04  0100 EL1 */
    beq     .L__in_el1              /* already at EL1 -> skip the EL2 setup */

    /* clear HCR_EL2[7:0]; the whole register is overwritten again below */
    mrs     x0, hcr_el2
    bic     x0, x0, #0xff
    msr     hcr_el2, x0

    msr     sp_el1, x1              /* in EL2, set SP_EL1 to .el_stack_top */

    /* enable CNTP for EL1 */
    mrs     x0, cnthctl_el2         /* Counter-timer Hypervisor Control register */
    orr     x0, x0, #3              /* set bits 0 and 1 */
    msr     cnthctl_el2, x0
    msr     cntvoff_el2, xzr        /* zero the virtual counter offset */

    /* enable AArch64 in EL1 */
    mov     x0, #(1 << 31)          /* AArch64 */
    orr     x0, x0, #(1 << 1)       /* SWIO hardwired on Pi3 */
    msr     hcr_el2, x0
    mrs     x0, hcr_el2             /* read back; the value is not consumed in this block */

    /* change execution level to EL1 */
    mov     x2, #0x3c4              /* DAIF masked, M[3:0] = 0100b (EL1t) */
    msr     spsr_el2, x2            /* 1111000100 */
    adr     x2, .L__in_el1
    msr     elr_el2, x2

    eret                            /* exception return. from EL2. continue from .L__in_el1 */

/*
 * GET_PHY dst, sym
 *
 * Load the PC-relative address of `sym` into `dst` using an adrp/add pair:
 * adrp yields the 4 KiB page of the symbol, the add supplies the low 12 bits.
 * Only `dst` is written.
 */
.macro GET_PHY dst, sym
    adrp    \dst, \sym
    add     \dst, \dst, :lo12:\sym
.endm

/*
 * Primary core, running at EL1: install the boot stack, stop trapping FP/SIMD,
 * zero .bss, program the translation table bases, build the early mapping and
 * switch the MMU on. Leaves through `ret` with x30 = after_mmu_enable.
 */
.L__in_el1:
    mov     sp, x1                  /* in EL1. sp = x1, loaded with .el_stack_top in _start */

    /* Set CPACR_EL1 (Architecture Feature Access Control Register) to avoid trap from SIMD or float point instruction */
    mov     x1, #0x00300000         /* bits 21:20 set: don't trap any SIMD/FP instructions in both EL0 and EL1 */
    msr     cpacr_el1, x1
    /* applying context change */
    dsb     ish
    isb

    /* clear bss: x1 = write cursor, x2 = bytes cleared 8 at a time, x3 = tail bytes */
    GET_PHY x1, __bss_start
    GET_PHY x2, __bss_end
    sub     x2, x2, x1              /* get bss size */

    and     x3, x2, #7              /* x3 = size % 8, i.e. 0..7 trailing bytes */
    bic     x2, x2, #7              /* x2 = size rounded down to a multiple of 8 */

.L__clean_bss_loop:
    cbz     x2, .L__clean_bss_loop_1
    str     xzr, [x1], #8           /* zero 8 bytes, post-increment cursor */
    sub     x2, x2, #8
    b       .L__clean_bss_loop

.L__clean_bss_loop_1:
    cbz     x3, .L__jump_to_entry
    strb    wzr, [x1], #1           /* zero the remaining bytes one at a time */
    sub     x3, x3, #1
    b       .L__clean_bss_loop_1

.L__jump_to_entry:          /* bss is clear: early MMU setup starts here */
    bl mmu_tcr_init

    bl get_ttbrn_base       /* x0 = value used for TTBR0_EL1 */
    add x1, x0, #0x1000     /* TTBR1_EL1 gets the address 4 KiB above it */

    msr ttbr0_el1, x0
    msr ttbr1_el1, x1
    dsb sy

    /* x3 = (PC-relative address of _start) - (link-time address of _start), or 0 */
#ifdef RT_USING_SMART
    ldr     x2, =_start
    GET_PHY x3, _start
    sub     x3, x3, x2
#else
    mov     x3, #0
#endif

    /* x0/x1 still hold the two table bases; presumably all of x0-x3 are arguments - confirm */
    ldr x2, =0x10000000     /* map 256M memory for kernel space */
    bl rt_hw_mem_setup_early

    ldr x30, =after_mmu_enable  /* set LR to after_mmu_enable function, it's a v_addr */

    mrs x1, sctlr_el1
    bic x1, x1, #(3 << 3)    /* dis SA, SA0 */
    bic x1, x1, #(1 << 1)    /* dis A */
    orr x1, x1, #(1 << 12)   /* I */
    orr x1, x1, #(1 << 2)    /* C */
    orr x1, x1, #(1 << 0)    /* M */
    msr sctlr_el1, x1        /* enable MMU */

    /*
     * NOTE(review): the I-cache and TLB invalidation below run after
     * SCTLR_EL1.M/I/C are set; confirm stale entries cannot be hit in between.
     */
    dsb ish
    isb
    ic ialluis               /* Invalidate all instruction caches in Inner Shareable domain to Point of Unification */
    dsb ish
    isb
    tlbi vmalle1             /* Invalidate all stage 1 translations used at EL1 with the current VMID */
    dsb ish
    isb
    ret                      /* continue at x30 = after_mmu_enable */

/*
 * Primary core, MMU on. Reached through the `ret` that follows the MMU enable
 * sequence, with x30 preloaded with this label's link-time address. Selects
 * SP_ELx, reloads the boot stack and tail-branches into rtthread_startup.
 */
after_mmu_enable:
#ifdef RT_USING_SMART
    mrs x0, tcr_el1          /* disable ttbr0, only using kernel space */
    orr x0, x0, #(1 << 7)    /* set TCR_EL1 bit 7 */
    msr tcr_el1, x0
    msr ttbr0_el1, xzr       /* clear the TTBR0 table base */
    dsb sy
#endif

    mov     x0, #1
    msr     spsel, x0        /* SPSel = 1: sp now names SP_ELx of the current level */
    adr     x1, .el_stack_top
    mov     sp, x1           /* sp (SP_EL1) = .el_stack_top */

    b  rtthread_startup      /* plain branch, no link: x30 is not updated here */

#ifdef RT_USING_SMP
/**
 *  secondary cpu
 *
 *  Entry point for the non-boot cores. Mirrors the primary path: drop from
 *  EL3/EL2 to EL1, pick a per-cpu 8 KiB slice of the boot stack, enable
 *  FP/SIMD access, program the translation table bases, enable the MMU and
 *  branch to rt_hw_secondary_cpu_bsp_start. .bss is not cleared and
 *  rt_hw_mem_setup_early is not called on this path.
 */

.global _secondary_cpu_entry
_secondary_cpu_entry:
    bl      rt_hw_cpu_id_set        /* presumably leaves this core's id in TPIDR_EL1, read below - confirm */
    adr     x1, .el_stack_top       /* x1 = top of the whole boot stack area */

    /* set up EL1 */
    mrs     x0, CurrentEL           /* CurrentEL Register. bit 2, 3. Others reserved */
    and     x0, x0, #12             /* clear reserved bits */

    /* running at EL3? */
    cmp     x0, #12                 /* 1100b. So, EL3 */
    bne     .L__not_in_el3_cpux     /* not EL3 -> continue with the EL2/EL1 checks */

    /* should never be executed, just for completeness. (EL3) */
    mov     x2, #0x5b1
    msr     scr_el3, x2             /* SCR_ELn  Secure Configuration Register */
    mov     x2, #0x3c9              /* DAIF masked, M[3:0] = 1001b (EL2h) */
    msr     spsr_el3, x2            /* SPSR_ELn. Saved Program Status Register. 1111001001 */
    adr     x2, .L__not_in_el3_cpux
    msr     elr_el3, x2
    eret                            /* Exception Return: from EL3, continue from .L__not_in_el3_cpux */

.L__not_in_el3_cpux:                /* running at EL2 or EL1 */
    /* x0 still holds the CurrentEL value sampled above */
    cmp     x0, #4                  /* 0x04  0100 EL1 */
    beq     .L__in_el1_cpux         /* already at EL1 -> skip the EL2 setup */

    /* clear HCR_EL2[7:0]; the whole register is overwritten again below */
    mrs     x0, hcr_el2
    bic     x0, x0, #0xff
    msr     hcr_el2, x0

    msr     sp_el1, x1              /* in EL2, set SP_EL1 to .el_stack_top (per-cpu offset applied later) */

    /* enable CNTP for EL1 */
    mrs     x0, cnthctl_el2         /* Counter-timer Hypervisor Control register */
    orr     x0, x0, #3
    msr     cnthctl_el2, x0
    msr     cntvoff_el2, xzr

    /* enable AArch64 in EL1 */
    mov     x0, #(1 << 31)          /* AArch64 */
    orr     x0, x0, #(1 << 1)       /* SWIO hardwired on Pi3 */
    msr     hcr_el2, x0
    mrs     x0, hcr_el2

    /* change execution level to EL1 */
    mov     x2, #0x3c4              /* DAIF masked, M[3:0] = 0100b (EL1t) */
    msr     spsr_el2, x2            /* 1111000100 */
    adr     x2, .L__in_el1_cpux
    msr     elr_el2, x2

    eret                            /* exception return. from EL2. continue from .L__in_el1_cpux */

.L__in_el1_cpux:
    mrs     x0, tpidr_el1           /* x0 = value left in TPIDR_EL1 (used as cpu index) */
    /* each cpu init stack is 8k */
    sub     x1, x1, x0, lsl #13     /* x1 = .el_stack_top - index * 8192 */
    mov     sp, x1                  /* in EL1. sp = this cpu's slice of the boot stack */

    /* Set CPACR_EL1 (Architecture Feature Access Control Register) to avoid trap from SIMD or float point instruction */
    mov     x1, #0x00300000         /* Don't trap any SIMD/FP instructions in both EL0 and EL1 */
    msr     cpacr_el1, x1
    /*
     * applying context change: the primary path issues the same barriers
     * after its CPACR_EL1 write, and the calls below must not run before
     * the new setting has taken effect.
     */
    dsb     ish
    isb

.L__jump_to_entry_cpux:             /* early MMU setup for this core starts here */

    /* init mmu early */

    bl mmu_tcr_init

    bl get_ttbrn_base        /* x0 = value used for TTBR0_EL1 */
    add x1, x0, #0x1000      /* TTBR1_EL1 gets the address 4 KiB above it */

    msr ttbr0_el1, x0
    msr ttbr1_el1, x1
    dsb sy

    ldr x30, =after_mmu_enable_cpux  /* set LR to after_mmu_enable function, it's a v_addr */

    mrs x1, sctlr_el1
    bic x1, x1, #(3 << 3)    /* dis SA, SA0 */
    bic x1, x1, #(1 << 1)    /* dis A */
    orr x1, x1, #(1 << 12)   /* I */
    orr x1, x1, #(1 << 2)    /* C */
    orr x1, x1, #(1 << 0)    /* M */
    msr sctlr_el1, x1        /* enable MMU */

    dsb sy
    isb sy
    ic ialluis               /* Invalidate all instruction caches in Inner Shareable domain to Point of Unification */
    dsb sy
    isb sy
    tlbi vmalle1             /* Invalidate all stage 1 translations used at EL1 with the current VMID */
    dsb sy
    isb sy
    ret                      /* continue at x30 = after_mmu_enable_cpux */

after_mmu_enable_cpux:
#ifdef RT_USING_SMART
    mrs x0, tcr_el1          /* disable ttbr0, only using kernel space */
    orr x0, x0, #(1 << 7)    /* set TCR_EL1 bit 7 */
    msr tcr_el1, x0
    msr ttbr0_el1, xzr
    dsb sy
#endif

    mov     x0, #1
    msr     spsel, x0               /* SPSel = 1: sp now names SP_ELx of the current level */
    mrs     x0, tpidr_el1
    /* each cpu init stack is 8k */
    adr     x1, .el_stack_top
    sub     x1, x1, x0, lsl #13
    mov     sp, x1                  /* in EL1. sp (SP_EL1) = this cpu's slice of the boot stack */

    b rt_hw_secondary_cpu_bsp_start
#endif

#if !defined(RT_CPUS_NR)
#define RT_CPUS_NR 1
#endif

/*
 * Early boot stacks: one 8192-byte slice per CPU, starting on a 4 KiB
 * (2^12) boundary. The boot code loads sp from .el_stack_top; with
 * RT_USING_SMP each secondary core subtracts (cpu index << 13) from that
 * address to reach its own slice.
 */
    .p2align 12
.el_stack:
    .skip   (8192 * RT_CPUS_NR)
.el_stack_top:
